```r
knitr::opts_chunk$set(
  collapse = TRUE,
  eval = FALSE,
  comment = "#>"
)

# Write each document in out_docs as a LaTeX subsection (or, with
# pandoc_output = TRUE, a pandoc/markdown section), optionally including
# the document description and OCR text.
documents_to_latex <- function(out_docs, include_description = TRUE,
                               include_ocr = TRUE, pandoc_output = FALSE) {
  for (doc in 1:nrow(out_docs)) {
    if (pandoc_output) {
      cat("\n\n")
      cat(paste0("## ", reportRx::sanitizestr(out_docs[doc, "title"]),
                 " (id: ", out_docs[doc, "id"], ")"))
      cat("\n\n")
    } else {
      cat(paste0("\\subsection{", reportRx::sanitizestr(out_docs[doc, "title"]),
                 " (id: ", out_docs[doc, "id"], ")}"))
      cat(" \n")
    }
    if (include_description) {
      if (pandoc_output) {
        cat("\n\n")
        cat("### description")
        cat("\n\n")
      } else {
        cat("\\subsubsection{description}")
        cat(" \n")
      }
      cat(reportRx::sanitizestr(out_docs[doc, "description"]))
      cat("\n\n")
    }
    if (include_ocr) {
      if (pandoc_output) {
        cat("\n\n")
        cat("### OCR")
        cat("\n\n")
      } else {
        cat("\\subsubsection{OCR}")
        cat(" \n")
      }
      cat(reportRx::sanitizestr(out_docs[doc, "ocr"]))
      if (pandoc_output) {
        cat(" ")
      } else {
        cat(" \n")
      }
    }
  }
}
```
This vignette covers the process of article classification using machine learning.
```r
library(durhamevp)
# also using tidyverse functions
library(tidyverse)
```
```r
alldocs <- get_document("all")
alldocs <- alldocs %>%
  dplyr::mutate(std_url = sub("download/", "", url))

docallocs <- get_allocation("all", allocation_type = "coding")
docallocs <- left_join(docallocs, alldocs, by = c("document_id" = "id"))

classdocs <- filter(docallocs, coding_complete == 1,
                    electoralviolence_nature %in% c("false", "true"))
classdocs <- classdocs %>%
  dplyr::mutate(std_url = sub("download/", "", url)) %>%
  dplyr::mutate(unclass = 0, classified = 1) %>%
  dplyr::mutate(EV_article = ifelse(electoralviolence_nature == "true", 1, 0))

systematic_search_terms <- c("incident", "riot", "mob", "rough", "disturbance",
                             "husting", "adjourn", "magistrate", "police",
                             "prison", "candidate", "election", "party")

all_searches <- get_archivesearches()
systematic_searches <- all_searches %>%
  dplyr::select(id, search_text, archive_date_start, archive_date_end) %>%
  filter(search_text %in% systematic_search_terms)

results_systematic_searches <- get_archivesearchresults(archive_search_id = systematic_searches$id) %>%
  left_join(all_searches, by = c("archive_search_id" = "id")) %>%
  mutate(std_url = sub("download/", "", url))

is_classified <- results_systematic_searches$std_url %in% classdocs$std_url
classified_results <- results_systematic_searches[is_classified, ]
```
```r
class_res_dfm <- durhamevp::searches_to_dfm(classified_results)
the_urls <- quanteda::docvars(class_res_dfm, "std_url")
ordered_cr <- classified_results[match(quanteda::docvars(class_res_dfm, "std_url"),
                                       classified_results$std_url), ]

# collapse multiple codings of the same document;
# sum_EV_article is computed before EV_article is overwritten with the maximum
summary_class <- classdocs %>%
  group_by(std_url, document_id) %>%
  mutate(EV_article = electoralviolence_nature == "true") %>%
  summarize(sum_EV_article = sum(EV_article),
            n = n(),
            EV_article = max(EV_article))

ordered_classified <- summary_class[match(quanteda::docvars(class_res_dfm, "std_url"),
                                          summary_class$std_url), ]
quanteda::docvars(class_res_dfm, "EV_article") <- ordered_classified$EV_article

# use 85 per cent of the classified data to train the classifier;
# the other 15 per cent are used as the testing set
n_train <- round(nrow(class_res_dfm) * .85)
the_sets <- split_dfm(class_res_dfm, n_train = n_train)
testing_urls <- quanteda::docvars(the_sets$testing_set, "std_url")
training_urls <- quanteda::docvars(the_sets$training_set, "std_url")

# train xgboost keyword classifier
classifier <- durhamevp::evp_classifiers(the_sets$training_set, "xgboost",
                                         "EV_article", "uniform")

# train naive bayes keyword classifier
classifier_nb <- durhamevp::evp_classifiers(the_sets$training_set, "nb",
                                            "EV_article", "uniform")
```
Note that this step should be carried out regularly to check that the performance of the classifier is improving rather than degrading; it is not, however, a necessary part of the process.
```r
# add predictions to the training and testing sets
quanteda::docvars(the_sets$training_set, "predicted_keywords") <-
  predict(classifier, newdata = the_sets$training_set, type = "class")
quanteda::docvars(the_sets$testing_set, "predicted_keywords") <-
  predict(classifier, newdata = the_sets$testing_set, type = "class")
quanteda::docvars(the_sets$testing_set, "predicted_keywords_nb") <-
  predict(classifier_nb, newdata = the_sets$testing_set, type = "prob")[, 2]
quanteda::docvars(the_sets$testing_set, "pred_class_keywords") <-
  factor(as.numeric(quanteda::docvars(the_sets$testing_set, "predicted_keywords") > .5))

results_table <- as.data.frame.matrix(
  caret::confusionMatrix(
    data = quanteda::docvars(the_sets$testing_set, "pred_class_keywords"),
    reference = factor(quanteda::docvars(the_sets$testing_set, "EV_article")),
    mode = "prec_recall", positive = "1")$table)

## **** probably delete ****
tmp_train <- quanteda::docvars(the_sets$training_set)
p1 <- ggplot(tmp_train, aes(predicted_keywords, EV_article)) +
  geom_point(position = position_jitter(height = .1)) +
  stat_smooth(method = "glm", method.args = list(family = "binomial")) +
  ggtitle("performance of the xgboost keyword classifier on the training set")

tmp_test <- quanteda::docvars(the_sets$testing_set)
p2 <- ggplot(tmp_test, aes(predicted_keywords, EV_article)) +
  geom_point(position = position_jitter(height = .1)) +
  stat_smooth(method = "glm", method.args = list(family = "binomial")) +
  ggtitle("performance of the xgboost keyword classifier on the testing set")
p2

p3 <- ggplot(tmp_test, aes(predicted_keywords_nb, EV_article)) +
  geom_point(position = position_jitter(height = .1)) +
  stat_smooth(method = "glm", method.args = list(family = "binomial")) +
  ggtitle("performance of the naive bayes keyword classifier on the testing set")
print(p3)

p4 <- ggplot(tmp_test, aes(y = EV_article)) +
  geom_point(colour = "red", position = position_jitter(height = .1),
             aes(x = predicted_keywords_nb)) +
  stat_smooth(method = "glm", colour = "red",
              method.args = list(family = "binomial"),
              aes(x = predicted_keywords_nb)) +
  geom_point(colour = "green", position = position_jitter(height = .1),
             aes(x = predicted_keywords)) +
  stat_smooth(method = "glm", colour = "green",
              method.args = list(family = "binomial"),
              aes(x = predicted_keywords)) +
  ggtitle("performance of the naive bayes (red) and xgboost (green) keyword classifiers on the testing set")
```
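Because this check is meant to be repeated each time the classifiers are retrained, it can be convenient to wrap the prediction and confusion-matrix steps in a small helper. The sketch below is illustrative only (the function name and the .5 cut-off are our own choices, not part of durhamevp):

```r
# Illustrative helper (not part of durhamevp): score a fitted keyword
# classifier against a labelled testing set and return caret's
# precision/recall summary.
evaluate_keyword_classifier <- function(classifier, testing_set,
                                        truth_var = "EV_article", cutoff = .5) {
  scores <- predict(classifier, newdata = testing_set, type = "class")
  caret::confusionMatrix(
    data = factor(as.numeric(scores > cutoff), levels = c(0, 1)),
    reference = factor(quanteda::docvars(testing_set, truth_var), levels = c(0, 1)),
    mode = "prec_recall", positive = "1")
}

# e.g. evaluate_keyword_classifier(classifier, the_sets$testing_set)
```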
We do not examine status 1 or status 3 documents.
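Equivalently, one could fetch candidates for all statuses and drop those two afterwards; a minimal sketch (assuming `get_candidate_documents` accepts the full status vector and that statuses are stored as character; the chunk below shows the form actually used):

```r
# Illustrative alternative: fetch all numeric statuses, then drop "1" and "3".
# (Note: this does not cover the empty-string status used below.)
all_candidates <- get_candidate_documents(status = as.character(0:8), include_ocr = FALSE)
undownloaded_candidates <- dplyr::filter(all_candidates, !status %in% c("1", "3"))
```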
```r
undownloaded_candidates <- get_candidate_documents(
  status = c("0", "2", "4", "5", "6", "7", "8", ""), include_ocr = FALSE)
undownloaded_candidates <- undownloaded_candidates %>%
  dplyr::mutate(std_url = sub("download/", "", url))

need_classifying <- results_systematic_searches$std_url %in% undownloaded_candidates$std_url
toclassify_results <- results_systematic_searches[need_classifying, ]

toclass_res_dfm <- durhamevp::searches_to_dfm(toclassify_results)
classify_urls <- quanteda::docvars(toclass_res_dfm, "std_url")

toclass_othervars <- toclassify_results %>%
  select(newspaper_title, publication_date, publication_location, type, std_url) %>%
  group_by(std_url, newspaper_title, publication_date, publication_location, type) %>%
  summarize()

ordered_tcov <- toclass_othervars[match(quanteda::docvars(toclass_res_dfm, "std_url"),
                                        toclass_othervars$std_url), ]
quanteda::docvars(toclass_res_dfm, "publication_date") <- ordered_tcov$publication_date
quanteda::docvars(toclass_res_dfm, "publication_location") <- ordered_tcov$publication_location
quanteda::docvars(toclass_res_dfm, "newspaper_title") <- ordered_tcov$newspaper_title
quanteda::docvars(toclass_res_dfm, "type") <- ordered_tcov$type

# this step reorders the dfm to have the columns in the same order as the training_set dfm
toclass_res_dfm <- quanteda::dfm_select(toclass_res_dfm, the_sets$training_set)

# this step actually implements the prediction
quanteda::docvars(toclass_res_dfm, "predicted_keywords") <-
  predict(classifier, newdata = toclass_res_dfm, type = "class")
```
The results set contains many references to the same document (perhaps with different descriptions). The next step is to identify the unique candidate documents to which these results relate, and to add the classification (where this is a subset, the classification is always 1).
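To see the extent of this duplication before deduplicating, something like the following sketch can help (plain dplyr; `distinct` is an alternative to the join-based approach used in the next chunk):

```r
# How many search results point at each candidate document?
results_systematic_searches %>%
  dplyr::count(std_url, sort = TRUE) %>%
  head()

# One row per unique document URL, keeping the first description seen
unique_results <- results_systematic_searches %>%
  dplyr::distinct(std_url, .keep_all = TRUE)
```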
```r
undownloaded_candidates_with_kwpred <- left_join(
  undownloaded_candidates, quanteda::docvars(toclass_res_dfm),
  by = c("std_url", "publication_date", "publication_location", "type"))

# add election name
undownloaded_candidates_with_kwpred <- undownloaded_candidates_with_kwpred %>%
  group_by(publication_date) %>%
  summarise() %>%
  mutate(election_name = date_to_election(publication_date)) %>%
  right_join(undownloaded_candidates_with_kwpred, by = "publication_date")

## If working by election, subset to 1874 (or another desired election)
kwclass_1874 <- undownloaded_candidates_with_kwpred %>%
  filter(election_name == "1874") %>%
  arrange(-predicted_keywords)
hist(kwclass_1874$predicted_keywords)

kwclass_1874_top5000 <- kwclass_1874 %>%
  slice(1:5000) %>%
  select(id, url, publication_title, description, status, g_status, title,
         status_writer, predicted_keywords)
kwclass_1874_top5000$status <- 1
kwclass_1874_top5000$status_writer <- "xgboost_keyword"
```
```r
to_csv <- kwclass_1874_top5000[, c("id", "url", "publication_title", "description",
                                   "status", "g_status", "title", "status_writer",
                                   "predicted_keywords")]
csv_filename <- gsub(" ", "_",
                     paste0("britishnewspaperarchive_",
                            format(Sys.time(), "%Y_%m_%d_%H_%M_%S"), ".csv"))
write.csv(to_csv, file = csv_filename, row.names = FALSE)
```
Then download the OCR of the documents identified by the keyword classifier.
```r
classified_corpus <- quanteda::corpus(classdocs[, c("ocr", "EV_article")],
                                      text_field = "ocr")
ocr_class_dfm <- durhamevp::preprocess_corpus(classified_corpus, min_termfreq = 20,
                                              min_docfreq = 20, stem = FALSE)
ocr_sets <- durhamevp::split_dfm(ocr_class_dfm, round(nrow(ocr_class_dfm) * .85))
ocr_classifier <- durhamevp::evp_classifiers(ocr_sets$training_set,
                                             classifier_type = "xgboost",
                                             training_classify_var = "EV_article")

quanteda::docvars(ocr_sets$testing_set, "ocr_predict_xgb") <-
  predict(ocr_classifier, ocr_sets$testing_set)
quanteda::docvars(ocr_sets$testing_set, "ocr_predict_class_xgb") <-
  as.numeric(quanteda::docvars(ocr_sets$testing_set, "ocr_predict_xgb") > .5)

ggplot(quanteda::docvars(ocr_sets$testing_set), aes(ocr_predict_xgb, EV_article)) +
  geom_point(position = "jitter") +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))

caret::confusionMatrix(
  data = factor(quanteda::docvars(ocr_sets$testing_set, "ocr_predict_class_xgb")),
  reference = factor(quanteda::docvars(ocr_sets$testing_set, "EV_article")),
  mode = "prec_recall")
```
```r
kwclass_1874_top5000$id[1:5]
needs_ocr_classify <- get_candidate_documents(cand_document_id = kwclass_1874_top5000$id,
                                              include_ocr = TRUE)
needs_classify_corpus <- quanteda::corpus(needs_ocr_classify, text_field = "ocr")
needs_classify_dfm <- durhamevp::preprocess_corpus(needs_classify_corpus,
                                                   min_termfreq = 20,
                                                   min_docfreq = 20, stem = FALSE)
needs_classify_dfm <- quanteda::dfm_select(needs_classify_dfm, ocr_sets$training_set)

# this step actually implements the prediction
quanteda::docvars(needs_classify_dfm, "predicted_ocr") <-
  predict(ocr_classifier, newdata = needs_classify_dfm, type = "class")
hist(quanteda::docvars(needs_classify_dfm, "predicted_ocr"))

classified_by_ocr <- quanteda::docvars(needs_classify_dfm)
aa <- left_join(classified_by_ocr,
                select(kwclass_1874_top5000, id, url, predicted_keywords)) %>%
  dplyr::mutate(std_url = sub("download/", "", url)) %>%
  arrange(-predicted_ocr)

ggplot(aa, aes(predicted_ocr)) +
  geom_histogram()
ggplot(aa, aes(predicted_keywords, predicted_ocr)) +
  geom_point() +
  stat_smooth(method = "lm")

# documents the keyword classifier rates highly but the OCR classifier rates low
bb <- aa[aa$predicted_keywords > .7 & aa$predicted_ocr < .1, ]

f1 <- lm(predicted_ocr ~ predicted_keywords, aa)
stargazer::stargazer(f1, type = "text")
cor(aa$predicted_keywords, aa$predicted_ocr)

importance <- xgboost::xgb.importance(feature_names = colnames(ocr_sets$training_set),
                                      model = ocr_classifier)
```
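The importance table computed above can be inspected directly to see which OCR tokens drive the classifier; a short illustrative follow-on using xgboost's built-in plotting helper:

```r
# top 20 most influential features in the OCR classifier
head(importance, 20)
xgboost::xgb.plot.importance(importance_matrix = importance[1:20, ])
```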
```r
download_these <- classified_by_ocr[, c("id", "url", "publication_title", "description",
                                        "status", "g_status", "title", "status_writer",
                                        "predicted_ocr")] %>%
  filter(predicted_ocr > .75) %>%
  mutate(status = 1, status_writer = "xgb_ocr") %>%
  as_tibble()

csv_filename <- gsub(" ", "_",
                     paste0("britishnewspaperarchive_",
                            format(Sys.time(), "%Y_%m_%d_%H_%M_%S"), ".csv"))
write.csv(download_these, file = csv_filename, row.names = FALSE)
```
```r
top_ones <- quanteda::docvars(toclass_res_dfm, "predicted_keywords") > .9
quanteda::docvars(toclass_res_dfm, "std_url")[top_ones][1:5]

as_df <- as_tibble(quanteda::docvars(toclass_res_dfm)) %>%
  mutate(pubyear = lubridate::year(publication_date))

#system.time(as_df <- as_df %>%
#  mutate(election = date_to_election(publication_date)))

as_df <- as_df %>%
  group_by(publication_date) %>%
  summarise() %>%
  mutate(election_name = date_to_election(publication_date)) %>%
  right_join(as_df, by = "publication_date")

pp1 <- as_df %>%
  left_join(election_dates, by = "election_name") %>%
  filter(keywords_13_complete == "Y") %>%
  ggplot(aes(election_name, predicted_keywords, group = election_name)) +
  geom_boxplot()

# violin version (overwrites the boxplot above)
pp1 <- as_df %>%
  left_join(election_dates, by = "election_name") %>%
  filter(keywords_13_complete == "Y") %>%
  ggplot(aes(election_name, predicted_keywords, group = election_name)) +
  geom_violin(trim = FALSE)

as_df <- as_df %>%
  mutate(electionn = as.numeric(election_name))

# bucket predicted probabilities into readable likelihood bands
prob_class <- function(x) {
  case_when(
    x > .75 ~ "highly likely (>.75)",
    x > .5 ~ "likely (>.5 & <.75)",
    x > .25 ~ "unlikely (<.5 & >.25)",
    x < .25 ~ "very unlikely (<.25)",
    TRUE ~ NA_character_
  )
}

pp4 <- as_df %>%
  mutate(prob_EV_keywords = prob_class(predicted_keywords)) %>%
  mutate(electionn = as.numeric(election_name)) %>%
  filter(electionn < 1875) %>%
  group_by(election_name, electionn, prob_EV_keywords) %>%
  tally() %>%
  ggplot(aes(electionn, n)) +
  facet_wrap(~prob_EV_keywords, scales = "free") +
  geom_point() +
  geom_line() +
  ylim(0, NA) +
  ggtitle("Number of Currently Undownloaded Articles in Each Election by EV_Article Likelihood (Keyword Classifier)")

pp2 <- as_df %>%
  group_by(electionn) %>%
  tally() %>%
  ggplot(aes(electionn, n)) +
  geom_point() +
  geom_line()
pp2

dim(results_systematic_searches)

rss <- results_systematic_searches %>%
  group_by(publication_date) %>%
  summarise() %>%
  mutate(election_name = date_to_election(publication_date)) %>%
  right_join(results_systematic_searches, by = "publication_date")

pp3 <- rss %>%
  group_by(election_name, search_text) %>%
  tally() %>%
  mutate(electionn = as.numeric(election_name)) %>%
  left_join(durhamevp::election_dates, by = "election_name") %>%
  mutate(articles_per_day = n / monthsearch_duration) %>%
  filter(keywords_13_complete == "Y") %>%
  ggplot(aes(electionn, articles_per_day)) +
  facet_wrap(~search_text, scales = "free") +
  geom_point() +
  geom_line() +
  ylim(0, NA) +
  ggtitle("Articles (per day) Containing 13 Systematic Search Terms During Month Around Elections") +
  theme_bw()
pp3

# link to searches
candocs <- get_candidate_documents(status = c("0", "1", "2", "4", "5", "6", "7", "8"),
                                   include_ocr = FALSE)
candocs <- candocs %>%
  dplyr::mutate(std_url = sub("download/", "", url))
candocs$EV_article <- ifelse(candocs$g_status %in% c("1", "3"), 1, 0)

candocs_training_set <- candocs[!candocs$std_url %in% testing_urls, ]
candocs_testing_set <- candocs[candocs$std_url %in% testing_urls, ]
descript_test <- classified_results[classified_results$std_url %in% testing_urls, ]

select_descript_xgb <- classifier_selection_description(candocs_training_set, descript_test,
                                                        classifier_type = "xgboost",
                                                        return_logical = TRUE,
                                                        logical_to_prob = TRUE,
                                                        stem = FALSE, min_docfreq = 20,
                                                        min_termfreq = 20)
descript_test <- descript_test %>%
  mutate(xgb_select = select_descript_xgb)

compare_docs <- descript_test %>%
  inner_join(candocs_testing_set, by = "description")
table(compare_docs$xgb_select > .5, compare_docs$g_status)

ggplot(compare_docs, aes(xgb_select, as.numeric(g_status == 1))) +
  geom_point(position = "jitter") +
  stat_smooth(method = "glm", method.args = list(family = "binomial")) +
  ggtitle("Performance of the xgboost description classifier on the testing set")

compare_docs <- descript_test %>%
  group_by(std_url) %>%
  summarise(maxselect = max(xgb_select)) %>%
  left_join(candocs_testing_set, by = "std_url")

check_descript_xgb <- classifier_selection_description(candocs_training_set,
                                                       candocs_training_set,
                                                       classifier_type = "xgboost",
                                                       return_logical = TRUE,
                                                       logical_to_prob = TRUE,
                                                       stem = FALSE, min_docfreq = 50,
                                                       min_termfreq = 50)
descript_test <- candocs_training_set %>%
  mutate(xgb_select = check_descript_xgb)

compare_docs <- descript_test %>%
  inner_join(candocs_training_set, by = "description")
table(compare_docs$xgb_select > .5, compare_docs$g_status.x)

ggplot(compare_docs, aes(xgb_select, as.numeric(g_status.x == "1"))) +
  geom_point(position = "jitter") +
  stat_smooth(method = "glm", method.args = list(family = "binomial")) +
  ggtitle("Performance of the xgboost description classifier on the training set")
```
```r
aa <- left_join(needs_ocr_classify, classified_by_ocr, by = c("id", "url")) %>%
  filter(predicted_ocr > .75)
documents_to_latex(sample_n(aa, 30), include_ocr = TRUE, pandoc_output = TRUE)
```